Setup

This document runs the sample representativity analyses reported in the supplement of the paper “Climate Change Engagement of Scientists”.

The combined_data.RDS file includes both the Scopus data and the survey data. We cannot make this data set publicly available, as it links the participants of our survey with data from Scopus, which includes exact citation counts, number of co-authors, first publication, last publication, h-index, etc. This would make it more likely that participants could be identified.

library(gt)
library(dplyr)
library(ggpubr)
library(tidyverse)
library(gtsummary)

# Shared styling constants for plots and tables.
# Only `colors` and `theme_minimal2` are referenced in the visible part of
# this document; the `default_*` values are kept unchanged for any code that
# relies on them elsewhere.
default_font_family <- 'Helvetica'
default_font_size <- 15
default_font_color <- '#444444'
default_background_color <- 'white'
default_na_col <- '#E22030'
default_plotly_width <- 800
default_plotly_height <- 550

# Palette; the plots below use the first two entries, one per sample.
colors <- c(
  '#FC4E07', '#00AFBB', '#E7B800',
  '#2cc990', '#E090DF', '#a0c4ff'
)

# theme_minimal() with both major and minor panel grid lines removed.
theme_minimal2 <- theme_minimal() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

# Turn a `haven_labelled` vector into a plain numeric vector.
#
# x: any vector (typically one column of a data frame).
# Returns `x` untouched unless it inherits from 'haven_labelled'; in that case
# the value labels are removed with haven::zap_labels() and the result is
# coerced with as.numeric().
convert_to_numeric <- function(x) {
  if (!inherits(x, 'haven_labelled')) {
    return(x)
  }
  as.numeric(haven::zap_labels(x))
}

# Read the combined data file and strip haven labels from every column
# (columns that are not haven_labelled pass through unchanged).
dat_all <- readRDS('../data/combined_data.RDS') %>%
  mutate(across(everything(), convert_to_numeric))

# Recode every cell equal to -99 as NA
dat_all[dat_all == -99] <- NA

# Derive the completion flag and keep only the columns used in the
# representativity analyses.
dat_all <- dat_all %>%
  mutate(
    # TRUE/FALSE are spelled out on purpose: `T` and `F` are ordinary
    # variables that can be reassigned, which would silently change this
    # classification.
    Survey_Finished = case_when(
      Progress >= 100 ~ TRUE,   # People who finished the survey
      Progress < 100 ~ FALSE,   # People who did not finish the survey
      is.na(Progress) ~ FALSE   # People who didn't start the survey
    )
  ) %>%
  select(
    ResponseId, H.Index, Number.of.Documents, Cited.By, Citation.Count,
    Co.Author.Count, First.Publication, Last.Publication, Continent,
    Time_Zone, Email_Wave, Survey_Finished, Progress, SurveySource
  )

# Convert the grouping / identifier columns to factors
factor_cols <- c(
  'ResponseId', 'Continent', 'Time_Zone',
  'Email_Wave', 'Survey_Finished', 'SurveySource'
)

dat_all[factor_cols] <- lapply(dat_all[factor_cols], as_factor)

# Build the comparison data frame: every invited scientist appears once as
# 'Total Sample'; those who finished the survey appear a second time as
# 'Survey'.
# Survey_Finished is a factor at this point, so it is compared against its
# level label 'TRUE' explicitly instead of relying on the reassignable
# shorthand `T` being coerced to that label.
comp <- rbind(
  transform(dat_all, Sample = 'Total Sample'),
  transform(dat_all[dat_all$Survey_Finished == 'TRUE', ], Sample = 'Survey')
)

Sample Representativity

The following section compares the total sample of invited scientists (N = 249,876) to the sample that actually participated in our study (n = 9,220).

Continent

This table shows the share of scientists on each continent in our Total Sample and among the scientists who completed our survey. European scientists are considerably over-represented, whereas Asian scientists are clearly under-represented. Scientists based in North America, South America, Oceania and Africa are slightly over-represented in our survey (e.g., North America accounts for 28% of the survey sample vs. 25% of the total sample). Note that we do not have country / continent information for every participant.

# Continent by sample: cross-table with column percentages.
# The 7th level of Continent is recoded to NA so that it is dropped from the
# table (missing = 'no').
# NOTE(review): this assumes level 7 is the "unknown continent" category; the
# level order depends on the data, so confirm if the input file changes.
levels(comp$Continent)[7] <- NA

t <- comp %>%
  tbl_cross(
    row = Continent,
    col = Sample,
    percent = 'col',
    missing = 'no',
    label = list(Sample ~ 'Sample Comparison', Continent ~ 'Continent')
  ) %>%
  # Hide the overall (row total) column
  modify_column_hide(columns = stat_0) %>%
  bold_labels()

t
Sample Comparison
Survey Total Sample
Continent

    Asia 1,072 (12%) 93,958 (38%)
    Europe 4,691 (51%) 77,916 (31%)
    North America 2,559 (28%) 63,365 (25%)
    Oceania 470 (5.1%) 8,189 (3.3%)
    South America 287 (3.1%) 3,808 (1.5%)
    Africa 110 (1.2%) 2,017 (0.8%)
Total 9,189 (100%) 249,253 (100%)

Year of First Publication

# Re-level Sample so that 'Total Sample' comes first and 'Survey' second;
# the Survey population is then drawn in front, and the two levels take
# colors[1] and colors[2] in the manual scales below.
comp$Sample <- factor(comp$Sample, levels = c('Total Sample', 'Survey'))

# Year of first publication: one density per sample plus one median line each.
# The medians are passed as constant `xintercept` values. Mapping them inside
# aes() on the subsetted data recycles the single median to every row of that
# subset, so one overlapping line would be drawn per row.
p_year <- comp %>%
  ggplot(aes(x = First.Publication, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Median for the total sample
  geom_vline(
    xintercept = median(
      comp$First.Publication[comp$Sample == 'Total Sample'],
      na.rm = TRUE
    ),
    linetype = 'longdash',
    color = colors[1]
  ) +
  # Median for people who finished the survey
  geom_vline(
    xintercept = median(
      comp$First.Publication[comp$Sample == 'Survey'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[2]
  ) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  scale_x_continuous(
    breaks = seq(1960, 2025, 5),
    limits = c(1960, 2025)
  ) +
  labs(
    title = 'Sample Comparison',
    subtitle = 'Lines indicate the Median',
    x = 'Year of first publication'
  ) +
  theme_minimal2

p_year

H-Index Scopus

# H-Index: one density per sample plus one median line each.
# The medians are passed as constant `xintercept` values. Mapping them inside
# aes() on the subsetted data recycles the single median to every row of that
# subset, so one overlapping line would be drawn per row.
p_hindex <- comp %>%
  ggplot(aes(x = H.Index, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Median for the total sample
  geom_vline(
    xintercept = median(
      comp$H.Index[comp$Sample == 'Total Sample'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[1]
  ) +
  # Median for people who finished the survey
  geom_vline(
    xintercept = median(
      comp$H.Index[comp$Sample == 'Survey'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[2]
  ) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  scale_x_continuous(
    breaks = seq(0, 125, 10),
    limits = c(0, 125)
  ) +
  labs(
    title = 'Sample Comparison',
    subtitle = 'Lines indicate the Median',
    x = 'H-Index'
  ) +
  theme_minimal2

p_hindex

Number of Articles

# Number of documents: one density per sample plus one median line each.
# The medians are passed as constant `xintercept` values. Mapping them inside
# aes() on the subsetted data recycles the single median to every row of that
# subset, so one overlapping line would be drawn per row.
p_articles <- comp %>%
  ggplot(aes(x = Number.of.Documents, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Median for the total sample
  geom_vline(
    xintercept = median(
      comp$Number.of.Documents[comp$Sample == 'Total Sample'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[1]
  ) +
  # Median for people who finished the survey
  geom_vline(
    xintercept = median(
      comp$Number.of.Documents[comp$Sample == 'Survey'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[2]
  ) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  scale_x_continuous(
    breaks = seq(0, 400, 50),
    limits = c(0, 400)
  ) +
  labs(
    title = 'Sample Comparison',
    subtitle = 'Lines indicate the Median',
    x = 'Number of Documents authored'
  ) +
  theme_minimal2

p_articles

Citations

# Citation count: one density per sample plus one median line each.
# The medians are passed as constant `xintercept` values. Mapping them inside
# aes() on the subsetted data recycles the single median to every row of that
# subset, so one overlapping line would be drawn per row.
p_citations <- comp %>%
  ggplot(aes(x = Citation.Count, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Median for the total sample
  geom_vline(
    xintercept = median(
      comp$Citation.Count[comp$Sample == 'Total Sample'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[1]
  ) +
  # Median for people who finished the survey
  geom_vline(
    xintercept = median(
      comp$Citation.Count[comp$Sample == 'Survey'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[2]
  ) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  scale_x_continuous(
    breaks = seq(0, 6000, 500),
    limits = c(0, 6000)
  ) +
  labs(
    title = 'Sample Comparison',
    subtitle = 'Lines indicate the Median',
    x = 'Total Number of Citations'
  ) +
  theme_minimal2

p_citations

Cited By

# Cited.By: one density per sample plus one median line each.
# The medians are passed as constant `xintercept` values. Mapping them inside
# aes() on the subsetted data recycles the single median to every row of that
# subset, so one overlapping line would be drawn per row.
p_citedby <- comp %>%
  ggplot(aes(x = Cited.By, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Median for the total sample
  geom_vline(
    xintercept = median(
      comp$Cited.By[comp$Sample == 'Total Sample'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[1]
  ) +
  # Median for people who finished the survey
  geom_vline(
    xintercept = median(
      comp$Cited.By[comp$Sample == 'Survey'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[2]
  ) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  scale_x_continuous(
    breaks = seq(0, 4000, 500),
    limits = c(0, 4000)
  ) +
  labs(
    title = 'Sample Comparison',
    subtitle = 'Lines indicate the Median',
    x = 'Total Number of Citing Authors'
  ) +
  theme_minimal2

p_citedby

Co-Author Count

# Co-author count: one density per sample plus one median line each.
# The medians are passed as constant `xintercept` values. Mapping them inside
# aes() on the subsetted data recycles the single median to every row of that
# subset, so one overlapping line would be drawn per row.
p_coauthors <- comp %>%
  ggplot(aes(x = Co.Author.Count, fill = Sample, col = Sample)) +
  geom_density(alpha = 0.6) +
  # Median for the total sample
  geom_vline(
    xintercept = median(
      comp$Co.Author.Count[comp$Sample == 'Total Sample'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[1]
  ) +
  # Median for people who finished the survey
  geom_vline(
    xintercept = median(
      comp$Co.Author.Count[comp$Sample == 'Survey'],
      na.rm = TRUE
    ),
    linetype = 'dashed',
    color = colors[2]
  ) +
  scale_fill_manual(values = colors[1:2]) +
  scale_color_manual(values = colors[1:2]) +
  scale_x_continuous(
    breaks = seq(0, 600, 50),
    limits = c(0, 600)
  ) +
  labs(
    title = 'Sample Comparison',
    subtitle = 'Lines indicate the Median',
    x = 'Total Number of Co-Authors'
  ) +
  theme_minimal2

p_coauthors

Combined Figure

Combined Table: Mean

# Sample overview.
# Re-level Sample with 'Survey' first and 'Total Sample' second.
comp$Sample <- factor(comp$Sample, levels = c('Survey', 'Total Sample'))

# Summary table by sample: mean (SD) for the numeric columns,
# n (%) for Continent. Missing values are not reported as a separate row.
t_mean <- select(
  comp,
  Sample,
  H.Index, Number.of.Documents, Cited.By, Citation.Count,
  Co.Author.Count, First.Publication,
  Continent
) %>%
  tbl_summary(
    by = Sample,
    missing = 'no',
    # Summarise every numeric column as continuous
    type = where(is.numeric) ~ 'continuous',
    statistic = list(
      all_categorical() ~ '{n} ({p}%)',
      all_continuous() ~ '{mean} ({sd})'
    )
  )

t_mean
Characteristic Survey, N = 9,220¹ Total Sample, N = 249,876¹
H.Index 21 (20) 23 (21)
Number.of.Documents 74 (117) 89 (124)
Cited.By 2,717 (6,953) 3,014 (6,860)
Citation.Count 3,774 (10,315) 4,145 (10,065)
Co.Author.Count 338 (1,136) 351 (967)
First.Publication 2,006 (12) 2,006 (11)
Continent

    Asia 1,072 (12%) 93,958 (38%)
    Europe 4,691 (51%) 77,916 (31%)
    North America 2,559 (28%) 63,365 (25%)
    Oceania 470 (5.1%) 8,189 (3.3%)
    South America 287 (3.1%) 3,808 (1.5%)
    Africa 110 (1.2%) 2,017 (0.8%)
¹ Mean (SD); n (%)

Combined Table: Median

# Summary table by sample: median (SD) for the numeric columns,
# n (%) for Continent. Missing values are not reported as a separate row.
# NOTE(review): the median is paired with the SD here rather than an
# interquartile range - confirm that this is the intended statistic.
t_med <- select(
  comp,
  Sample,
  H.Index, Number.of.Documents, Cited.By, Citation.Count,
  Co.Author.Count, First.Publication,
  Continent
) %>%
  tbl_summary(
    by = Sample,
    missing = 'no',
    # Summarise every numeric column as continuous
    type = where(is.numeric) ~ 'continuous',
    statistic = list(
      all_categorical() ~ '{n} ({p}%)',
      all_continuous() ~ '{median} ({sd})'
    )
  )
t_med
Characteristic Survey, N = 9,220¹ Total Sample, N = 249,876¹
H.Index 15 (20) 18 (21)
Number.of.Documents 36 (117) 49 (124)
Cited.By 710 (6,953) 1,002 (6,860)
Citation.Count 871 (10,315) 1,243 (10,065)
Co.Author.Count 97 (1,136) 125 (967)
First.Publication 2,009 (12) 2,008 (11)
Continent

    Asia 1,072 (12%) 93,958 (38%)
    Europe 4,691 (51%) 77,916 (31%)
    North America 2,559 (28%) 63,365 (25%)
    Oceania 470 (5.1%) 8,189 (3.3%)
    South America 287 (3.1%) 3,808 (1.5%)
    Africa 110 (1.2%) 2,017 (0.8%)
¹ Median (SD); n (%)